{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#coding:utf-8\n",
    "import jieba\n",
    "\n",
    "# 创建停用词list\n",
    "def stopwordslist(filepath):\n",
    "    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]\n",
    "    return stopwords\n",
    "\n",
    "# 对句子进行分词\n",
    "def seg_sentence(sentence, cut_engine, stopwords):\n",
    "    sentence_seged = cut_engine.cut(sentence.strip())\n",
    "    seg_list = []\n",
    "    for word in sentence_seged:\n",
    "        if word not in stopwords:\n",
    "            seg_list.append(word)\n",
    "    return seg_list\n",
    "\n",
    "# 得分函数，分数越大，属于标题SEO优化越大\n",
    "def getScore(sen_list, positions_labeled_dict, use_SEO_algorithm):\n",
    "    if(use_SEO_algorithm):        \n",
    "        tmp = set()\n",
    "        for word in sen_list:\n",
    "            if word in positions_labeled_dict:\n",
    "                tmp.add(positions_labeled_dict[word])\n",
    "        return len(tmp)\n",
    "    else:\n",
    "        count = 0\n",
    "        for word in sen_list:\n",
    "            if word in positions_labeled_dict:\n",
    "                count += 1\n",
    "        return count\n",
    "\n",
    "# 如果是SEO标题返回1，否则返回0\n",
    "def anti_SEO_algorithm(sentence, threshold, cut_engine, stopwords, positions_labeled_dict, use_SEO_algorithm=True):\n",
    "    # 分词\n",
    "    sen_lis = seg_sentence(sentence.lower(), cut_engine, stopwords)\n",
    "    score = getScore(sen_lis, positions_labeled_dict, use_SEO_algorithm)\n",
    "    if(score >= threshold):\n",
    "        return 1\n",
    "    else:\n",
    "        return 0\n",
    "    \n",
    "def combination_strategy_SEO_title_recognition(sentence, threshold1, threshold2, cut_engine, stopwords, positions_labeled_dict):\n",
    "    # 分词\n",
    "    sen_list = seg_sentence(sentence.lower(), cut_engine, stopwords)\n",
    "    tmp = set()\n",
    "    count2 = 0\n",
    "    for word in sen_list:\n",
    "        if word in positions_labeled_dict:\n",
    "            count2 += 1\n",
    "            tmp.add(positions_labeled_dict[word])\n",
    "    count1 = len(tmp)\n",
    "    if(count1 >= threshold1):\n",
    "        return 1\n",
    "    elif(count2 >= threshold2):\n",
    "        return 1\n",
    "    else:\n",
    "        return 0    \n",
    "    \n",
    "def solveSingleFile(inputFile, outputFile1, outputFile0, cut_engine, stopwords, positions_labeled_dict, threshold, use_SEO_algorithm=True):\n",
    "    fout1 = open(outputFile1, \"w\", encoding=\"utf-8\")\n",
    "    fout0 = open(outputFile0, \"w\", encoding=\"utf-8\")\n",
    "    with open(inputFile,\"rb\") as fin:\n",
    "        line = fin.readline().decode('utf-8').lower().strip()\n",
    "        while(line):\n",
    "            # sentence = line.split(sep=\"\\u0001\")[1]\n",
    "            res = anti_SEO_algorithm(line, threshold, cut_engine, stopwords, positions_labeled_dict, use_SEO_algorithm)\n",
    "            if(res == 1):\n",
    "                fout1.write(line)\n",
    "                fout1.write(\"\\n\")\n",
    "            if(res == 0):\n",
    "                fout0.write(line)\n",
    "                fout0.write(\"\\n\")\n",
    "            line = fin.readline().decode('utf-8').lower().strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 2.597 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "# 加载分词用的外部字典\n",
    "zhiliandict = \"./my_dict/zhilian.dict\"\n",
    "jieba.load_userdict(zhiliandict)\n",
    "stopwords = stopwordslist('./stopwords/stopwords.txt')  # 这里加载停用词的路径"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['前端', 'php', 'ios', 'java', 'ui', 'vc++', '测试', '安卓']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 分词测试\n",
    "sen = \"前端/PHP/IOS/JAVA/UI/VC++/测试/安卓\".lower()\n",
    "sen_lis = seg_sentence(sen, jieba, stopwords)\n",
    "sen_lis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda3\\lib\\site-packages\\gensim\\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
      "  warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n"
     ]
    }
   ],
   "source": [
    "# 逻辑回归\n",
    "# 01 加载词向量字典\n",
    "import gensim\n",
    "model_path = \"./model/job_title_word2vec.model\"\n",
    "model = gensim.models.Word2Vec.load(model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "# sen2vec函数\n",
    "def sen2vec(sen, cut_engine, stopwords, word_vec):\n",
    "    sen_lis = seg_sentence(sen, cut_engine, stopwords)\n",
    "    res = np.zeros([150], dtype=np.float32)\n",
    "    for word in sen_lis:\n",
    "        if word in word_vec:\n",
    "            res += word_vec[word]\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 构建训练集\n",
    "POS_path = './data/POS.txt'\n",
    "NEG_path = './data/NEG.txt'\n",
    "with open(POS_path, encoding='utf-8') as POS_in:\n",
    "    POS_lis = [sen2vec(line.strip(), jieba, stopwords, model.wv) for line in POS_in]\n",
    "with open(NEG_path, encoding='utf-8') as NEG_in:\n",
    "    NEG_lis = [sen2vec(line.strip(), jieba, stopwords, model.wv) for line in NEG_in]\n",
    "y_POS = [1] * len(POS_lis)\n",
    "y_NEG = [0] * len(NEG_lis)\n",
    "X_lis = POS_lis + NEG_lis\n",
    "y_lis = y_POS + y_NEG\n",
    "X_train = np.array(X_lis, dtype=np.float32)\n",
    "y_train = np.array(y_lis, dtype=np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 打乱训练集顺序\n",
    "from sklearn.utils import shuffle\n",
    "X_train, y_train = shuffle(X_train, y_train, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight='balanced', dual=False,\n",
       "          fit_intercept=True, intercept_scaling=1, max_iter=100,\n",
       "          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n",
       "          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 精确率、召回率、F1\n",
    "from sklearn.linear_model.logistic import LogisticRegression\n",
    "from sklearn.cross_validation import train_test_split, cross_val_score\n",
    "classifier = LogisticRegression(class_weight='balanced')\n",
    "classifier.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "精确率： 0.647606625335 [ 0.64868701  0.63964912  0.65509761  0.64415215  0.65044723]\n",
      "召回率： 0.977951899217 [ 0.9827957   0.98010753  0.97419355  0.97471759  0.97794513]\n",
      "综合评价指标： 0.779194955631 [ 0.78153057  0.77409766  0.78339818  0.77568493  0.78126343]\n"
     ]
    }
   ],
   "source": [
    "# 结果评价\n",
    "precisions = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision') \n",
    "print('精确率：', np.mean(precisions), precisions)\n",
    "recalls = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall')\n",
    "print('召回率：', np.mean(recalls), recalls)\n",
    "f1s = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')\n",
    "print('综合评价指标：', np.mean(f1s), f1s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./LR_model/lr_model_final2.m']"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 保存LR训练好的模型\n",
    "from sklearn.externals import joblib\n",
    "joblib.dump(classifier, \"./LR_model/lr_model_final2.m\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# LR 预测函数\n",
    "def LR_predict(sen, lr_model, cut_engine, stopwords, word_vec):\n",
    "    sen_vec = sen2vec(sen, cut_engine, stopwords, word_vec).reshape(1,150)\n",
    "    return lr_model.predict(sen_vec)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def solveSingleFile_LR(inputFile, outputFile1, outputFile0, lr_model, cut_engine, stopwords, word_vec):\n",
    "    fout1 = open(outputFile1, \"w\", encoding=\"utf-8\")\n",
    "    fout0 = open(outputFile0, \"w\", encoding=\"utf-8\")\n",
    "    with open(inputFile,\"rb\") as fin:\n",
    "        line = fin.readline().decode('utf-8').lower().strip()\n",
    "        while(line):\n",
    "            res = LR_predict(line, lr_model, cut_engine, stopwords, word_vec)\n",
    "            if(res == 1):\n",
    "                fout1.write(line)\n",
    "                fout1.write(\"\\n\")\n",
    "            if(res == 0):\n",
    "                fout0.write(line)\n",
    "                fout0.write(\"\\n\")\n",
    "            line = fin.readline().decode('utf-8').lower().strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "inputFile = \"./data/NEG_test.txt\"\n",
    "outputFile1 = \"./data/NEG_test_1_new.txt\"\n",
    "outputFile0 = \"./data/NEG_test_0_new.txt\"\n",
    "solveSingleFile_LR(inputFile,outputFile1,outputFile0,classifier,jieba,stopwords,model.wv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "positions_labeled = [[line.strip().split(\" \")[0], int(line.strip().split(\" \")[1])] for line in open(\"./positions_kmeans_result/positions_labeled_115_0926.txt\", 'r', encoding='utf-8').readlines()]\n",
    "positions_labeled_dict = dict(positions_labeled) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2109"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(positions_labeled_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sen1 = \"前端/PHP/IOS/JAVA/UI/VC++/测试/安卓\"\n",
    "sen2 = \"高级Java开发工程师（中关村\"\n",
    "sen = \"开发2部 Java编程实习生+五险一金+包住宿\"\n",
    "combination_strategy_SEO_title_recognition(sen, 3, 4, jieba, stopwords, positions_labeled_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "java_title_list = [line.strip() for line in open(\"./job_title/000001_0\", 'r', encoding='utf-8').readlines()]\n",
    "fout1 = open(\"./job_title/000001_1\", \"w\", encoding=\"utf-8\")\n",
    "fout0 = open(\"./job_title/000001_0\", \"w\", encoding=\"utf-8\")\n",
    "for line in java_title_list:\n",
    "    res = combination_strategy_SEO_title_recognition(line, 3, 5, jieba, stopwords, positions_labeled_dict)\n",
    "    if(res == 1):\n",
    "        fout1.write(line)\n",
    "        fout1.write(\"\\n\")\n",
    "    if(res == 0):\n",
    "        fout0.write(line)\n",
    "        fout0.write(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['\\ufeffJAVA讲师',\n",
       " 'Java开发工程师',\n",
       " '前端/PHP/IOS/JAVA/UI/VC++/测试/安卓',\n",
       " 'PHP中级工程师',\n",
       " 'Java工程师',\n",
       " 'Java研发工程师',\n",
       " 'Java开发工程师（实习生',\n",
       " '高级Java开发工程师（中关村）',\n",
       " '高级java工程师（技术中心）',\n",
       " '开发2部 Java编程实习生+五险一金+包住宿',\n",
       " 'Java开发工程师包住宿',\n",
       " '研发部二 招聘Java初级工程师',\n",
       " '研发三部高薪Java开发(无经验/转行均可）',\n",
       " '高级JAVA开发工程师',\n",
       " '中级/高级Java工程师',\n",
       " 'java开发工程师',\n",
       " 'java开发工程师',\n",
       " 'java开发工程师',\n",
       " '开发2部170 java软件开发实习生高薪定岗',\n",
       " '高级java开发工程师',\n",
       " 'JAVA高级开发工程师（云平台开发）',\n",
       " 'JAVA开发工程师',\n",
       " 'java代码工程师（信息安全方向）',\n",
       " '高级JAVA开发工程师',\n",
       " 'Java工程师',\n",
       " '高级JAVA开发工程师',\n",
       " 'Java开发工程师（初中高级）',\n",
       " 'JAVA软件开发实习生转正8千起薪',\n",
       " 'JAVA华锐科技定岗实习生+转正五险一金',\n",
       " 'java工程师（寿险核心开发）',\n",
       " 'JAVA高级开发工程师',\n",
       " 'Java开发工程师',\n",
       " 'JAVA高级工程师',\n",
       " 'Java高级工程师',\n",
       " 'Java工程师',\n",
       " 'Java 技术经理/技术专家',\n",
       " '高级Java工程师',\n",
       " 'java开发工程师',\n",
       " '招聘软件专业 ★应届生实习 ★0基础培养JAVA★ 包住宿',\n",
       " 'Java高级工程师',\n",
       " 'Java高级工程师',\n",
       " 'java高级工程师',\n",
       " 'Web前端开发（Java基础）',\n",
       " 'Cisco Software Engineer (Java)--英语流利',\n",
       " '项目组2 招java软件开发+年底双薪',\n",
       " '研发部1 8kJava软件开发工程师',\n",
       " 'JAVA开发工程师',\n",
       " 'JAVA开发工程师（石景山区）',\n",
       " 'Java工程师',\n",
       " 'java初级开发工程师',\n",
       " '转行IT（JAVA软件开发） 双休+4K-6K',\n",
       " '诚聘Java开发人员',\n",
       " 'Java高级开发工程师',\n",
       " '急聘JAVA开发工程师',\n",
       " '其他专业转行IT行业java开发方向',\n",
       " 'Java服务器开发（游戏）',\n",
       " '项目组2 零基础Java工程师（Java/安卓）',\n",
       " '高级Java开发/资深Java开发',\n",
       " 'java开发工程师【每年涨薪/带薪年假/旅游】',\n",
       " '中级java开发工程师+五险一金+不限学历']"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "java_title_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# inputFile = \"./job_title/000001_0\"\n",
    "# outputFile1 = \"./job_title/000001_0_1\"\n",
    "# outputFile0 = \"./job_title/000001_0_0\"\n",
    "inputFile = \"./data/NEG_test_1_new.txt\"\n",
    "outputFile1 = \"./data/NEG_test_1_new_1_3.txt\"\n",
    "outputFile0 = \"./data/NEG_test_1_new_0_3.txt\"\n",
    "solveSingleFile(inputFile, outputFile1, outputFile0, jieba, stopwords, positions_labeled_dict, 4)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
